In [2]:

    
import pandas as pd
import numpy as np
import json

from bokeh.resources import INLINE
from ipywidgets import interact
from bokeh.io import push_notebook  ,output_notebook , show , output_file
from bokeh.plotting import figure
from bokeh.models.widgets import Select 
from bokeh.models import HoverTool, ColumnDataSource , CheckboxGroup , CustomJS
from bokeh.layouts import widgetbox , row , layout

import seaborn as sns
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)

store = pd.HDFStore('../../data/processed/orbf_benin.h5')
data = store['data']
store.close()

colormap = json.load(open('../../references/departements_colors.json'))



In [3]:

    
len(data)

a = data.groupby(['filetype_name' , 'entity_id' , 'period']).apply(len)



In [4]:

    
len(a['Facture Mensuelle PMA'])









    Out[4]:





16131

High Level Data description

Full downloaded data has 564549 indicator/facility/months data points. We limit this data set to only data points with a validation, which reduces the data set to 434292 indicator/facility/months, representing 77% of the complete data.



In [5]:

    
data.filetype_name.value_counts()









    Out[5]:





Facture Mensuelle PMA    406503
Name: filetype_name, dtype: int64



In [6]:

    
def make_timeline_data(data) :
    out = len(data)
    data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
    
    perc_right = sum(data['correction']) / len(data)
    
    return pd.DataFrame([[out, perc_right]])

timeline_data = data.groupby(['entity_name' , 'entity_type' ,'period']).apply(make_timeline_data).reset_index()
timeline_data.columns = ['entity_name', 'entity_type' ,'date' , 'level_1' , 'value' , 'percent_right']



In [7]:

    
def make_fac_desc(data) :
    start_reporting = min(data.period)
    n_values = len(data)
    entity_type = data.entity_type.unique()[0]
    data['correction'] = (data.indicator_claimed_value == data.indicator_verified_value)
    
    perc_right = sum(data['correction']) / len(data)
    
    return pd.DataFrame([[start_reporting ,n_values , entity_type , perc_right]])

fac_desc = data.groupby('entity_name').apply(make_fac_desc).reset_index()
fac_desc.columns = ['entity_name' , 'level_1' ,'date_start' , 'n_values' , 'entity_type' , 'percent_right']



In [8]:

    
result = fac_desc.sort_values(['entity_type' ,'date_start', 'n_values' , 'entity_name'], ascending=[ 0 , 0 , 1 , 0])



In [9]:

    
timeline_data.entity_name_cat = data.entity_name.astype('category').cat.set_categories(result['entity_name'], ordered=True)



In [10]:

    
colors = []
for i in range(len(timeline_data)) :
    colors.append(colormap[timeline_data.loc[i , 'entity_type']])

We first look at the complete data to see how frequently correction happens in records.

The color is the type of facility we are considering. The shade is the percentage of reported indicator values that have been validated without being corrected.



In [12]:

    
#output_notebook(INLINE)

output_file('fac_data_quality')



In [ ]:

    
print(np.median(timeline_data.percent_right))

print(np.average(timeline_data.percent_right))

print(np.average(timeline_data.percent_right[timeline_data.percent_right < 1]))

print(np.median(timeline_data.percent_right[timeline_data.percent_right < 1]))



In [14]:

    
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)

ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)









    



C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j



In [13]:

    
source = ColumnDataSource(data=dict(
    xname=timeline_data.date.astype(str).tolist(),
    yname=timeline_data.entity_name.tolist() ,
    colors=colors,
    alphas= timeline_data.percent_right.tolist() ,
    type_fac = timeline_data.entity_type ,
))


TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"

p = figure(title="Monthly Reporting by facility",
           x_axis_location="above", tools=TOOLS, 
           x_range=sorted(list(set(timeline_data.date.astype(str).tolist()))), 
           y_range=result['entity_name'].astype(str).tolist())

p.plot_width = 800
p.plot_height = 1200
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

p.rect('xname', 'yname', 0.9, 0.9,  source=source,
       color='colors', alpha='alphas', line_color=None,
       hover_line_color='black', hover_color='colors')
p.select_one(HoverTool).tooltips = [
    ('Fosa', '@yname') ,
    ('Period' , '@xname') ,
    ('% Right' , '@alphas')
]


show(p)

We can see data has been entered since March 2012 in a group of facilities, and is entered regularly until november 2016. A second group of facilities start reporting in July 2015 and appear to stop reporting one year later. Some facilites start reporting around April 2014 and have been reporting since then.

We see facility specific patterns, with some facilities being consistently less corrected than others. This can be added to a period effect, withy apparently less correction at the end of periods. This could be interrogated.

Amount Spent



In [16]:

    
zone_color_dico = json.load(open('../../references/departements_colors.json'))
data['parent_geozone_name_color'] = ""
data['indicator_color'] = ""
for i in sorted(data.parent_geozone_name.unique().tolist()) :
    data.loc[data.parent_geozone_name == i , 'parent_geozone_name_color'] = zone_color_dico[i]



In [17]:

    
def received_amounts(data):
    return data.indicator_montant.sum(skipna = True)

amount_fac = data.groupby(['entity_id' , 'date' , 'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(received_amounts)

amount_fac = amount_fac[amount_fac < 50000000]

amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' , 'date' , 'entity_type' , 'entity_name' ,'parent_geozone_name' , 'parent_geozone_name_color' , 'value']



In [18]:

    
ax = sns.distplot(amount_fac.value , color="b" , bins=50)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,)

ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)









    



C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j



In [19]:

    
gammas = sns.load_dataset("gammas")



In [24]:

    
gammas.head()









    Out[24]:






  
    
      
      timepoint
      ROI
      subject
      BOLD signal
    
  
  
    
      0
      0.0
      IPS
      0
      0.513433
    
    
      1
      0.0
      IPS
      1
      -0.414368
    
    
      2
      0.0
      IPS
      2
      0.214695
    
    
      3
      0.0
      IPS
      3
      0.814809
    
    
      4
      0.0
      IPS
      4
      -0.894992



In [22]:

    
amount_fac.head()









    Out[22]:






  
    
      
      entity_id
      date
      entity_type
      entity_name
      parent_geozone_name
      parent_geozone_name_color
      value
    
  
  
    
      0
      2
      2012-03-01
      Maternité isolée
      Csa Kpovie Mi
      ATLANTIQUE
      #1f78b4
      136500.0
    
    
      1
      2
      2012-04-01
      Maternité isolée
      Csa Kpovie Mi
      ATLANTIQUE
      #1f78b4
      136350.0
    
    
      2
      2
      2012-05-01
      Maternité isolée
      Csa Kpovie Mi
      ATLANTIQUE
      #1f78b4
      189450.0
    
    
      3
      2
      2012-06-01
      Maternité isolée
      Csa Kpovie Mi
      ATLANTIQUE
      #1f78b4
      85650.0
    
    
      4
      2
      2012-07-01
      Maternité isolée
      Csa Kpovie Mi
      ATLANTIQUE
      #1f78b4
      161100.0



In [21]:

    
ax = sns.tsplot(data = amount_fac , time = 'date' , value = 'value'  , color="b" )
ax.set(xlabel='% of correct indicators')
#sns.plt.xlim(0,)

#ax.figure.savefig("../../reports/figures/payment_distribution.pdf", dpi=1200)









    Out[21]:





[<matplotlib.text.Text at 0xc24f845a58>]



In [12]:

    
def make_multiline_data(data):
    xs = [data.date.tolist()]
    ys = [data.value.tolist()]
    color = [data.parent_geozone_name_color.tolist()]
    return pd.DataFrame([[xs[0] , ys[0]]])

amount_fac = amount_fac.groupby(['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color']).apply(make_multiline_data)
amount_fac = amount_fac.reset_index()
amount_fac.columns = ['entity_id' ,'entity_type' , 'entity_name' , 'parent_geozone_name' , 'parent_geozone_name_color', 'level' , 'xs' , 'ys']



In [13]:

    
dept = ['All'] + sorted(data.parent_geozone_name.unique())
indics = ['All'] + sorted(data.indicator_label.unique())



In [14]:

    
output_notebook(INLINE)









    





    
        
        Loading BokehJS ...



In [15]:

    
source = ColumnDataSource(data=dict(xs = [] , ys = [] , color = [], facility_name = [] , legend = []))

TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(x_axis_type="datetime",  plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Amount Paid'
p.multi_line('xs' , 'ys',  source = source , color='color' , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
                                    ("Département" , "@legend")
                                   ]

def update(Zones = "All"):
    df = amount_fac
    if (Zones != "All"):
        df = df[df.parent_geozone_name.str.contains(Zones)==True]
    source.data = dict(
        color = df['parent_geozone_name_color'].tolist() ,
        xs=df['xs'].tolist(),
        ys=df['ys'].tolist(),
        facility_name = df['entity_name'].tolist(),
        legend = df['parent_geozone_name'].tolist(),
    )
    push_notebook()

show(p, notebook_handle=True)
update()



In [16]:

    
interact(update, Zones=dept)









    Out[16]:





<function __main__.update>

There are large variations in the amounts of money paid to different facilities for RBF. Variations happen within facilities as well as between different facilites. Amplitudes of variations do not appear to have evidently defined patterns.

Comparing errors



In [17]:

    
verif_data = data
verif_data['difference_indicateur'] = verif_data.indicator_verified_value - verif_data.indicator_claimed_value
verif_data['perc_variation_indicator'] = verif_data['difference_indicateur'] / verif_data['indicator_claimed_value']



In [18]:

    
verif_data = verif_data[(verif_data.indicator_claimed_value < 5000.0) &
                        (verif_data.perc_variation_indicator < 20) 
                       ]



In [19]:

    
output_notebook(INLINE)









    





    
        
        Loading BokehJS ...



In [20]:

    
source = ColumnDataSource(data=dict(xs = [] , ys = [] , facility_name = [] , departement = [] , indicator = []))

TOOLS = "hover,save,pan,box_zoom,wheel_zoom,reset"
p = figure(plot_height=600, plot_width=700, title="Evolution of amounts paid" , tools=TOOLS)
p.grid.grid_line_alpha=0.3
p.xaxis.axis_label = 'Reported Amount'
p.yaxis.axis_label = '% Variation'
p.circle('xs' , 'ys',  source = source , alpha=0.4)
p.select_one(HoverTool).tooltips = [("Facility", "@facility_name") ,
                                    ("Département" , "@departement") ,
                                    ('Indicator' , "@indicator")
                                   ]

def update(departement = "All" , Indicateurs = 'All'):
    df = verif_data
    if (departement != "All"):
        df = df[df.parent_geozone_name.str.contains(departement)==True]
        indics = ['All'] + sorted(df.indicator_label.unique())
    if (Indicateurs != "All"):
        df = df[df.indicator_label == Indicateurs]
    source.data = dict(
        xs=df['indicator_claimed_value'].tolist(),
        ys=df['perc_variation_indicator'].tolist(),
        facility_name = df['entity_name'].tolist(),
        departement = df['parent_geozone_name'].tolist(),
        indicator = df['indicator_label'].tolist()
    )
    push_notebook()

show(p, notebook_handle=True)
update()



In [21]:

    
interact(update, departement=dept , Indicateurs = indics)









    Out[21]:





<function __main__.update>

Amount Correction



In [22]:

    
def correct_amount(data) :
    data.indicator_claimed_value = sum(data.indicator_claimed_value)
    data.difference_indicateur = sum(data.indicator_verified_value - data.indicator_claimed_value)
    data.claimed_montant = sum(data.indicator_claimed_value * data.indicator_tarif)
    data.difference_montant = sum(data.indicator_montant - data.claimed_montant)
    return pd.DataFrame([[data.claimed_montant.tolist() , 
                          data.difference_montant.tolist() , 
                          data.difference_indicateur.tolist() ,
                        data.indicator_claimed_value.tolist()[0]]])

verif_data_report = data.groupby(['entity_id' , 'entity_name' , 'parent_geozone_name' ,'date']).apply(correct_amount)

verif_data_report = verif_data_report.reset_index()

verif_data_report.columns = ['entity_id' , 'entity_name' , 'parent_geozone_name' , 'date' , 'level_5', 
                      'claimed_montant' , 'difference_montant' , 'difference_indicateur' , 'claimed_indicateur']

verif_data_report['perc_variation_montant'] = verif_data_report['difference_montant'] / verif_data_report['claimed_montant']


#verif_data_report = verif_data_report[(np.abs(verif_data_report.difference_montant) < 500000000)]
print(np.median(verif_data_report.difference_montant))









    



-977555285.0



In [119]:

    
verif_data_report_plot = verif_data_report[(np.abs(verif_data_report.difference_montant) <10000000000)]
ax = sns.distplot(verif_data_report_plot.difference_montant , color="b")
ax.set(xlabel='Correction of payment after data validation')
ax.figure.savefig("../../reports/figures/payment_correction.png", dpi=1200)









    



C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j



In [118]:

    
np.median(verif_data_report.difference_montant[(np.abs(verif_data_report.difference_montant) <10000000000)])









    Out[118]:





-970017945.0



In [ ]:

    
ax = sns.distplot(timeline_data.percent_right[timeline_data.percent_right < 1] , color="b" , bins=12)
ax.set(xlabel='% of correct indicators')
sns.plt.xlim(0,1.1)

ax.figure.savefig("../../reports/figures/correct_dist.pdf", dpi=1200)



In [131]:

    
import matplotlib.pyplot as plt
sns.despine(left=True)

verif_data_report_plot2 = verif_data_report_plot[verif_data_report_plot.entity_name.isin(['Csa Dekanme Cs' , 'Tohoue Csc' , 'Bembe Di' , 'Zounta Mi'])]

g = sns.FacetGrid(verif_data_report_plot2, col="entity_name", col_wrap=2, size=5 , sharex = False , sharey = False )
g = g.map(sns.distplot, "difference_montant")

g.fig.savefig("../../reports/figures/facility_correct.pdf", dpi=1000)









    



C:\Users\grlurton\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    





<matplotlib.figure.Figure at 0x413f0b6d8>



In [ ]:

	ROI	subject	BOLD signal
0	IPS	0	0.513433
1	IPS	1	-0.414368
2	IPS	2	0.214695
3	IPS	3	0.814809
4	IPS	4	-0.894992

	entity_id	date	entity_type	entity_name	parent_geozone_name	parent_geozone_name_color	value
0	2	2012-03-01	Maternité isolée	Csa Kpovie Mi	ATLANTIQUE	#1f78b4	136500.0
1	2	2012-04-01	Maternité isolée	Csa Kpovie Mi	ATLANTIQUE	#1f78b4	136350.0
2	2	2012-05-01	Maternité isolée	Csa Kpovie Mi	ATLANTIQUE	#1f78b4	189450.0
3	2	2012-06-01	Maternité isolée	Csa Kpovie Mi	ATLANTIQUE	#1f78b4	85650.0
4	2	2012-07-01	Maternité isolée	Csa Kpovie Mi	ATLANTIQUE	#1f78b4	161100.0